1 Setup

1.1 Packages

library(tidyverse)
library(cowplot)
library(GGally)
library(heatmaply)
library(sva)
library(limma)
library(biobroom)
library(ggridges)

1.2 Data

1.2.1 Metabolite Abundances

# Cells #
vf.cell.neg.raw <- read_csv("./data/abundances/vpa_fa_1and2_cells_target_negmode.csv")
Parsed with column specification:
cols(
  .default = col_double(),
  Samples = col_character(),
  Mode = col_character(),
  Type = col_character(),
  Group = col_character(),
  VPA = col_character(),
  FA = col_character(),
  Plate = col_character()
)
See spec(...) for full column specifications.
vf.cell.pos.raw <- read_csv("./data/abundances/vpa_fa_1and2_cells_target_posmode.csv")
Parsed with column specification:
cols(
  .default = col_double(),
  Samples = col_character(),
  Mode = col_character(),
  Type = col_character(),
  Group = col_character(),
  VPA = col_character(),
  FA = col_character(),
  Plate = col_character()
)
See spec(...) for full column specifications.
# Media #
vf.med.neg.raw <- read_csv("./data/abundances/vpa_fa_1and2_media_target_negmode.csv")
Parsed with column specification:
cols(
  .default = col_double(),
  Samples = col_character(),
  Mode = col_character(),
  Type = col_character(),
  Group = col_character(),
  VPA = col_character(),
  FA = col_character(),
  Plate = col_character()
)
See spec(...) for full column specifications.
vf.med.pos.raw <- read_csv("./data/abundances/vpa_fa_1and2_media_target_posmode.csv")
Parsed with column specification:
cols(
  .default = col_double(),
  Samples = col_character(),
  Mode = col_character(),
  Type = col_character(),
  Group = col_character(),
  VPA = col_character(),
  FA = col_character(),
  Plate = col_character()
)
See spec(...) for full column specifications.

1.2.2 Compound Information

# Cells #
vf.cell.neg.compound.info <- read_csv("./data/compound_info/vpa_fa_1and2_cells_target_negmode_cmpnd_info.csv")
Parsed with column specification:
cols(
  compound_short = col_character(),
  compound_full = col_character(),
  formula = col_character(),
  mass = col_double(),
  rt = col_double(),
  cas_id = col_character()
)
vf.cell.pos.compound.info <- read_csv("./data/compound_info/vpa_fa_1and2_cells_target_posmode_cmpnd_info.csv")
Parsed with column specification:
cols(
  compound_short = col_character(),
  compound_full = col_character(),
  formula = col_character(),
  mass = col_double(),
  rt = col_double(),
  cas_id = col_character()
)
# Media #
vf.med.neg.compound.info <- read_csv("./data/compound_info/vpa_fa_1and2_media_target_negmode_cmpnd_info.csv")
Parsed with column specification:
cols(
  compound_short = col_character(),
  compound_full = col_character(),
  formula = col_character(),
  mass = col_double(),
  rt = col_double(),
  cas_id = col_character()
)
vf.med.pos.compound.info <- read_csv("./data/compound_info/vpa_fa_1and2_media_target_posmode_cmpnd_info.csv")
Parsed with column specification:
cols(
  compound_short = col_character(),
  compound_full = col_character(),
  formula = col_character(),
  mass = col_double(),
  rt = col_double(),
  cas_id = col_character()
)
# Kegg/other ID reference #
cmpnd.id.db <- read_csv("./data/other/anp_db_compound_kegg.csv")
Parsed with column specification:
cols(
  compound_full = col_character(),
  cas_id = col_character(),
  HMDB = col_character(),
  Metlin = col_character(),
  KEGG = col_character()
)

1.3 Functions

MissingPerSamplePlot <- function(raw.data, start.string) {
  # Counts the number of missing/NA values per sample and
  # percent compounds missing out of total number of compounds per sample
  # Then passes the result into a vertical bar plot, where each 
  # bar represents a single sample and the size of the bar 
  # is the % of compounds missing
  
  counted.na <- raw.data %>%
  select(starts_with(start.string)) %>% 
  mutate(
    count.na = apply(., 1, function(x) sum(is.na(x))),
    percent.na = (count.na / ncol(raw.data %>% select(starts_with(start.string)))) * 100
    ) %>%
  dplyr::select(count.na, percent.na) %>%
  bind_cols(
    raw.data %>% 
      select(Samples, Group)
      ) %>% 
  arrange(percent.na) %>% 
  mutate(f.order = factor(Samples, levels = Samples))
counted.na %>% 
  ggplot(aes(x = f.order, y = percent.na, fill = Group)) +
  geom_bar(stat = "identity") +
  geom_hline(yintercept = 20, color = "gray", size = 1, alpha = 0.8) +
  coord_flip()+
  xlab("Samples") +
  ylab("Percent missing values in sample") +
  theme(axis.text.y = element_text(size = 6)) 
}
MissingPerCompound <- function(raw.data, start.string){
  # Function to count in how many experimental samples each compound is missing
  # Also calculates the % missing:
  # (# NA per compound across all experimental samples) * 100 / (tot num of samples)
  
  raw.data %>% 
  filter(Group == "sample") %>% 
  select(Samples, starts_with(start.string)) %>% 
  gather(key = "Compound", value = "Abundance", -Samples) %>% 
  group_by(Compound) %>% 
  summarise(
    na_count = sum(is.na(Abundance)),
    n_samples = n(),
    percent_na = (na_count * 100 / n_samples)
    ) %>% 
  filter(na_count > 0) %>% 
  arrange(desc(percent_na))
}
ReplaceNAwMinLogTransformMult <- function(raw.dataframe, start.prefix) {
  # Function to replace any NAs in each column with the minimum for that column, 
  # separately for each sample type.
  # NA in negative control samples are replaced by 2.
  # Then data is log2 transformed
  
  # experimental samples #
  smpls <- raw.dataframe %>%
    filter(Group == "sample") %>% 
    dplyr::select(starts_with(start.prefix))
  smpls.min <- lapply(smpls, min, na.rm = TRUE)
  smpls.noNA <- raw.dataframe  %>%
    filter(Group == "sample") %>% 
    dplyr::select(Samples:Plate) %>%
    bind_cols(
      smpls %>%
        replace_na(replace = smpls.min) %>% 
        log2()
      ) 
  # QC samples #
  QC <- raw.dataframe %>%
    filter(Group == "QC") %>% 
    dplyr::select(starts_with(start.prefix))
  QC.min <- lapply(QC, min, na.rm = TRUE)
  # replace the missing values in the QC with the minimum of the QC
  # then take the log
  QC.noNA <- raw.dataframe  %>%
    filter(Group == "QC") %>% 
    dplyr::select(Samples:Plate) %>%
    bind_cols(
      QC %>%
        replace_na(replace = QC.min) %>% 
        log2()
      )
  # not samples or QC #
  other.min <- setNames(
    as.list(
      rep(2, ncol(
        raw.dataframe %>% 
          dplyr::select(starts_with(start.prefix))))
      ),
    colnames(raw.dataframe %>% dplyr::select(starts_with(start.prefix)))
                )
  other.num.log <- raw.dataframe  %>%
    filter(Group != "QC" & Group != "sample") %>% 
    dplyr::select(Samples:Plate) %>%
    bind_cols(
      raw.dataframe %>% 
        filter(Group != "QC" & Group != "sample") %>% 
        dplyr::select(starts_with(start.prefix)) %>%
        replace_na(replace = other.min) %>% 
        log2()
      )
  all.noNA <- smpls.noNA %>% 
    bind_rows(QC.noNA) %>% 
    bind_rows(other.num.log)
}
HeatmapPrepAlt <- function(raw.data, start.prefix){
  # function for preparing dara for heatmap viz
  x <- raw.data %>% 
    select(starts_with(start.prefix)) %>% 
    scale(center = TRUE, scale = TRUE) %>% 
    as.matrix() 
  row.names(x) <- raw.data$Samples
  return(x)
}

2 Data Exploration

2.1 Mass vs Retention Time

Q: What are the distributions of compound masses and retention times?

full.vf.cmpnd <- vf.cell.neg.compound.info %>% 
  mutate(Set = "Cells / Neg") %>% 
  bind_rows(vf.cell.pos.compound.info %>% mutate(Set = "Cells / Pos")) %>% 
  bind_rows(vf.med.neg.compound.info %>% mutate(Set = "Media / Neg")) %>%
  bind_rows(vf.med.pos.compound.info %>% mutate(Set = "Media / Pos"))
full.vf.cmpnd %>% 
  ggplot(aes(x = rt, y = mass)) +
  geom_point(size = 3, alpha = 0.3) +
  xlab("Retention Time (min)") +
  ylab("Mass (Da)") +
  ggtitle("Mass v RT\nVPA + FA Exp") +
  ylim(0, 1000)

full.vf.cmpnd %>% 
  ggplot(aes(x = rt, y = mass, color = Set)) +
  geom_point(size = 3, alpha = 0.8) +
  xlab("Retnetion Time (min)") +
  ylab("Mass (Da)") +
  ggtitle("Mass v RT\nVPA + FA Exp") +
  facet_wrap(~ Set) +
  ylim(0, 1000)

Q: Which compounds were found in one or more of the data types?

## cell join ##
vf.cell.cmpnd.join <- vf.cell.neg.compound.info %>% 
  inner_join(vf.cell.pos.compound.info, by = "cas_id", suffix = c(".c.n", ".c.p")) %>% 
  select(
    contains("cas_id"), contains("short"), 
    contains("full"), starts_with("formula"), 
    starts_with("mass"), starts_with("rt")
    )
# compound names - found in pos and neg mode / cells 
print(vf.cell.cmpnd.join$compound_full.c.n)
 [1] "Methylglyoxal"                              
 [2] "Glycine"                                    
 [3] "Alanine"                                    
 [4] "Sarcosine"                                  
 [5] "GABA"                                       
 [6] "2-Aminobutyric Acid"                        
 [7] "Serine"                                     
 [8] "Hypotaurine"                                
 [9] "Uracil"                                     
[10] "Proline"                                    
[11] "Caproic Acid"                               
[12] "Valine"                                     
[13] "Methylmalonic Acid"                         
[14] "Threonine"                                  
[15] "Cysteine"                                   
[16] "Taurine"                                    
[17] "4-Oxoproline"                               
[18] "trans-4-Hydroxyproline"                     
[19] "Creatine"                                   
[20] "Isoleucine"                                 
[21] "Leucine"                                    
[22] "Asparagine"                                 
[23] "Aspartic Acid"                              
[24] "Adenine"                                    
[25] "Hypoxanthine"                               
[26] "Glutamine"                                  
[27] "Lysine"                                     
[28] "Glutamic Acid"                              
[29] "Methionine"                                 
[30] "Guanine"                                    
[31] "Xanthine"                                   
[32] "3-Sulfinoalanine"                           
[33] "Histidine"                                  
[34] "Orotic Acid"                                
[35] "Allantoin"                                  
[36] "Phenylalanine"                              
[37] "G3P"                                        
[38] "Glycerol 1-Phosphate / Glycerol 3-Phosphate"
[39] "Arginine"                                   
[40] "N-Carbamoyl-L-aspartic Acid"                
[41] "Tyrosine"                                   
[42] "D-Galactitol"                               
[43] "N-alpha-Acetylglutamine"                    
[44] "Glucuronic Acid"                            
[45] "Galacturonic Acid"                          
[46] "Tryptophan"                                 
[47] "Phosphocreatine"                            
[48] "O-Succinylhomoserine"                       
[49] "Pantothenic Acid"                           
[50] "GalNAc"                                     
[51] "Cystathionine"                              
[52] "Deoxycytidine"                              
[53] "10Z-Pentadecenoic Acid"                     
[54] "Uridine"                                    
[55] "Palmitoleic Acid"                           
[56] "Ribothymidine"                              
[57] "Thiamine"                                   
[58] "Inosine"                                    
[59] "10Z-Heptadecenoic Acid"                     
[60] "Myoinositol-methyl-phosphate"               
[61] "Oleic Acid"                                 
[62] "Guanosine"                                  
[63] "1-Monomyristin"                             
[64] "Glutathione"                                
[65] "Neu5Ac"                                     
[66] "Arachidic Acid"                             
[67] "UMP"                                        
[68] "3-Phosphoglyceroinositol"                   
[69] "AMP"                                        
[70] "GMP"                                        
[71] "SAH"                                        
[72] "CDP"                                        
[73] "ADP"                                        
[74] "Folic Acid"                                 
[75] "GDP"                                        
[76] "LysoPE(18:0)"                               
[77] "dTTP"                                       
[78] "CTP"                                        
[79] "UTP"                                        
[80] "dATP"                                       
[81] "ATP"                                        
[82] "GTP"                                        
[83] "ADP-Ribose"                                 
[84] "UDP-Glucuronic acid"                        
[85] "ADP-Glucose"                                
[86] "GDP-Glucose"                                
[87] "UDP-GalNAc"                                 
[88] "UDP-GlcNAc"                                 
[89] "GSSG"                                       
[90] "NAD"                                        
[91] "NADH"                                       
[92] "NADP"                                       
[93] "CoA"                                        
[94] "PC(36:4)"                                   
[95] "Acetyl-CoA"                                 
# percent of cell / neg compounds found in cell / pos 
round(nrow(vf.cell.cmpnd.join) * 100 / nrow(vf.cell.neg.compound.info), 1)
[1] 53.4
# percent of cell / neg compounds found in cell / pos 
round(nrow(vf.cell.cmpnd.join) * 100 / nrow(vf.cell.pos.compound.info), 1)
[1] 56.5
vf.cell.cmpnd.join %>% 
  select(contains("mass")) %>% 
  ggpairs()

vf.cell.cmpnd.join %>% 
  select(starts_with("rt")) %>% 
  ggpairs()

### Media join ###
vf.med.cmpnd.join <- vf.med.neg.compound.info %>% 
  inner_join(vf.med.pos.compound.info, by = "cas_id", suffix = c(".m.n", ".m.p")) %>% 
  select(
    contains("cas_id"), contains("short"), 
    contains("full"), starts_with("formula"), 
    starts_with("mass"), starts_with("rt")
    )
# compound names - found in pos and neg mode / media
print(vf.med.cmpnd.join$compound_full.m.n)
 [1] "Glycine"          "Alanine"          "Serine"          
 [4] "Uracil"           "Creatinine"       "Valine"          
 [7] "Threonine"        "Taurine"          "4-Oxoproline"    
[10] "Isoleucine"       "Leucine"          "Hypoxanthine"    
[13] "Glutamine"        "Lysine"           "Glutamic Acid"   
[16] "Methionine"       "Allantoin"        "Phenylalanine"   
[19] "Pyridoxine"       "Tyrosine"         "D-Galactitol"    
[22] "Pantothenic Acid" "Uridine"          "Inosine"         
[25] "Guanosine"        "Folic Acid"      
# percent of media / neg compounds found in media / pos
round(nrow(vf.med.cmpnd.join) * 100 / nrow(vf.med.neg.compound.info), 1)
[1] 49.1
# percent of media / pos compounds found in media / neg
round(nrow(vf.med.cmpnd.join) * 100 / nrow(vf.med.pos.compound.info), 1)
[1] 51
# vf all match
vf.all.cmpnd.join <- vf.cell.cmpnd.join %>% 
  inner_join(vf.med.cmpnd.join, by = "cas_id") %>% 
  select(
    contains("cas_id"), contains("short"), 
    contains("full"), starts_with("formula"), 
    starts_with("mass"), starts_with("rt")
    )
nrow(vf.all.cmpnd.join)
[1] 24
vf.all.cmpnd.join %>% 
  select(contains("mass")) %>% 
  ggpairs()

vf.all.cmpnd.join %>% 
  select(starts_with("rt")) %>% 
  ggpairs()

2.2 Missing Values

Q: Do any of the samples have greater than 20% missing (NA) compound abundances, out of all of the features in the dataset?

MissingPerSamplePlot(vf.cell.neg.raw, "VPA_FAnC") +
  ggtitle("Missing Per Sample\nVPA + FA / Cells / Neg Mode")

MissingPerSamplePlot(vf.cell.pos.raw, "VPA_FApC") +
  ggtitle("Missing Per Sample\nVPA + FA / Cells / Pos Mode")

MissingPerSamplePlot(vf.med.neg.raw, "VPA_FAnM") +
  ggtitle("Missing Per Sample\nVPA + FA / Media / Neg Mode")

MissingPerSamplePlot(vf.med.pos.raw, "VPA_FApM") +
  ggtitle("Missing Per Sample\nVPA + FA / Media / Pos Mode")

Q: Are any of the compounds more than 20% missing in the experimental sample group? If there are any, they will be excluded from analysis.

MissingPerCompound(vf.cell.neg.raw, "VPA_FAnC") %>% 
  filter(percent_na > 20)
# A tibble: 0 x 4
# ... with 4 variables: Compound <chr>, na_count <int>, n_samples <int>,
#   percent_na <dbl>
(vf.cell.pos.cmpnd.to.excl <- MissingPerCompound(vf.cell.pos.raw, "VPA_FApC") %>% 
  filter(percent_na > 20))
# A tibble: 1 x 4
  Compound   na_count n_samples percent_na
  <chr>         <int>     <int>      <dbl>
1 VPA_FApC80       23        24       95.8
(vf.med.neg.cmpnd.to.excl <- MissingPerCompound(vf.med.neg.raw, "VPA_FAnM") %>% 
  filter(percent_na > 20))
# A tibble: 4 x 4
  Compound   na_count n_samples percent_na
  <chr>         <int>     <int>      <dbl>
1 VPA_FAnM49       24        24        100
2 VPA_FAnM50       24        24        100
3 VPA_FAnM51       24        24        100
4 VPA_FAnM53        6        24         25
(vf.med.pos.cmpnd.to.excl <- MissingPerCompound(vf.med.pos.raw, "VPA_FApM") %>% 
  filter(percent_na > 20))
# A tibble: 3 x 4
  Compound   na_count n_samples percent_na
  <chr>         <int>     <int>      <dbl>
1 VPA_FApM37       24        24        100
2 VPA_FApM41       24        24        100
3 VPA_FApM43       24        24        100

2.3 Negative Controls and Compound Elimination

2.3.1 Cells / Negative Mode

vf.cell.neg.raw.grp.mean <- vf.cell.neg.raw %>% 
  group_by(Group) %>% 
  summarize_at(vars(matches("VPA_FAnC")), mean, na.rm = TRUE) %>% 
  gather(key = "Compound", value = "Grp_mean_abun", -Group)
vf.cell.neg.raw.grp.mean %>% 
  ggplot(aes(log2(Grp_mean_abun), color = Group)) +
  geom_density(size = 2, alpha = 0.8) +
  ggtitle("Distribution of compound means\nVPA + FA / Cells / Negative Mode\nGrouped by sample type")

vf.cell.neg.raw.grp.mean.order <- vf.cell.neg.raw.grp.mean %>% 
  filter(Group == "sample") %>% 
  arrange(Grp_mean_abun)
vf.cell.neg.raw %>% 
  select(Samples, Group, starts_with("VPA_FAnC")) %>% 
  gather("Compound", value = "Abundance", -c(Samples, Group)) %>% 
  mutate(Cmpnd_sort = factor(Compound, levels = vf.cell.neg.raw.grp.mean.order$Compound)) %>% 
  ggplot(aes(Cmpnd_sort, log2(Abundance), color = Group, group = Samples)) + 
  geom_line(alpha = 0.1, size = 1) + 
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
  xlab("Compound") +
  geom_line(
    data = vf.cell.neg.raw.grp.mean %>% 
      mutate(Cmpnd_sort = factor(Compound, levels = vf.cell.neg.raw.grp.mean.order$Compound)), 
    aes(Cmpnd_sort, log2(Grp_mean_abun), color = Group, group = Group),
    size = 0.5
    ) +
  ggtitle("Profile Plot of all compound abundances\nWith average per sample type overlaid\nVPA + FA / Cells / Negative Mode")

vf.cell.neg.raw.grp.mean %>% 
  mutate(Cmpnd_sort = factor(Compound, levels = vf.cell.neg.raw.grp.mean.order$Compound)) %>% 
  ggplot(aes(Cmpnd_sort, log2(Grp_mean_abun), color = Group, group = Group)) +
  geom_point(size = 1, alpha = 0.8) +
  geom_line(alpha = 0.8) +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
  xlab("Compound") +
  ylab("log2(Sample Type Mean)") +
  ggtitle("Profile Plot of compound means by sample type only\nVPA + FA / Cells / Negative Mode")

vf.cell.neg.raw.grp.diff <- vf.cell.neg.raw.grp.mean %>% 
  spread(Group, Grp_mean_abun) %>% 
  mutate(
    smpl_empty_diff = sample / empty,
    smpl_solv_diff = sample / solv,
    empty_solv_diff = empty / solv
    )
vf.cell.neg.raw.grp.diff %>% 
  ggplot(aes(log2(smpl_empty_diff))) +
  geom_histogram(bins = 50)

vf.cell.neg.raw.grp.diff %>% 
  ggplot(aes(log2(smpl_solv_diff))) +
  geom_histogram(bins = 50)

vf.cell.neg.raw.grp.diff %>% 
  ggplot(aes(log2(smpl_empty_diff), log2(smpl_solv_diff))) +
  geom_point(size = 2, alpha = 0.5) +
  xlim(-4, 13) +
  ylim(-1, 13) +
  # abline in pink
  geom_abline(intercept = 0, slope = 1, color =  "#CC79A7", size = 2, alpha = 0.6) +
  # lm line in green
  geom_smooth(method = "lm", color = "#009E73", size = 2, alpha = 0.6, se = FALSE) +
  xlab("log2([Sample Compound Mean] / [Empty Compound Mean])") +
  ylab("log2([Sample Compound Mean] / [Solvent Compound Mean])") +
  ggtitle("Background signal\nRaw Data / Cells / Neg Mode")

# include compounds with FC > 2.5 or FC is NA (indication of NA in solv or empty)
vf.cell.neg.cmpnd.to.incl <- vf.cell.neg.raw.grp.diff %>% 
  filter((smpl_solv_diff > 2.5 & smpl_empty_diff > 2.5) | is.na(smpl_solv_diff) | is.na(smpl_empty_diff))
# original number of metabolites
nrow(vf.cell.neg.raw.grp.diff)
[1] 178
# number of metabolites after filtering 
nrow(vf.cell.neg.cmpnd.to.incl)
[1] 131

2.3.2 Cells / Positive Mode

vf.cell.pos.raw.grp.mean <- vf.cell.pos.raw %>% 
  group_by(Group) %>% 
  summarize_at(vars(matches("VPA_FApC")), mean, na.rm = TRUE) %>% 
  gather(key = "Compound", value = "Grp_mean_abun", -Group)
vf.cell.pos.raw.grp.mean %>% 
  ggplot(aes(log2(Grp_mean_abun), color = Group)) +
  geom_density(size = 2, alpha = 0.8) +
  ggtitle("Distribution of compound means\nVPA + FA / Cells / Positive Mode\nGrouped by sample type")

vf.cell.pos.raw.grp.mean.order <- vf.cell.pos.raw.grp.mean %>% 
  filter(Group == "sample") %>% 
  arrange(Grp_mean_abun)
vf.cell.pos.raw %>% 
  select(Samples, Group, starts_with("VPA_FApC")) %>% 
  gather("Compound", value = "Abundance", -c(Samples, Group)) %>% 
  mutate(Cmpnd_sort = factor(Compound, levels = vf.cell.pos.raw.grp.mean.order$Compound)) %>% 
  ggplot(aes(Cmpnd_sort, log2(Abundance), color = Group, group = Samples)) + 
  geom_line(alpha = 0.1, size = 1) + 
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
  xlab("Compound") +
  geom_line(
    data = vf.cell.pos.raw.grp.mean %>% 
      mutate(Cmpnd_sort = factor(Compound, levels = vf.cell.pos.raw.grp.mean.order$Compound)), 
    aes(Cmpnd_sort, log2(Grp_mean_abun), color = Group, group = Group),
    size = 0.5
    ) +
  ggtitle("Profile Plot of all compound abundances\nWith average per sample type overlaid\nVPA + FA / Cells / Positive Mode")

vf.cell.pos.raw.grp.mean %>% 
  mutate(Cmpnd_sort = factor(Compound, levels = vf.cell.pos.raw.grp.mean.order$Compound)) %>% 
  ggplot(aes(Cmpnd_sort, log2(Grp_mean_abun), color = Group, group = Group)) +
  geom_point(size = 1, alpha = 0.8) +
  geom_line(alpha = 0.8) +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
  xlab("Compound") +
  ylab("log2(Sample Type Mean)") +
  ggtitle("Profile Plot of compound means by sample type only\nVPA + FA / Cells / Positive Mode")

vf.cell.pos.raw.grp.diff <- vf.cell.pos.raw.grp.mean %>% 
  spread(Group, Grp_mean_abun) %>% 
  mutate(
    smpl_empty_diff = sample / empty,
    smpl_solv_diff = sample / solv,
    empty_solv_diff = empty / solv
    )
vf.cell.pos.raw.grp.diff %>% 
  ggplot(aes(log2(smpl_empty_diff))) +
  geom_histogram(bins = 50)

vf.cell.pos.raw.grp.diff %>% 
  ggplot(aes(log2(smpl_solv_diff))) +
  geom_histogram(bins = 50)

vf.cell.pos.raw.grp.diff %>% 
  ggplot(aes(log2(smpl_empty_diff), log2(smpl_solv_diff))) +
  geom_point(size = 2, alpha = 0.5) +
  xlim(-5, 11) +
  ylim(-1, 11) +
  geom_abline(intercept = 0, slope = 1, color =  "#CC79A7", size = 2, alpha = 0.6) +
  geom_smooth(method = "lm", color = "#009E73", size = 2, alpha = 0.6, se = FALSE) +
  xlab("log2([Sample Compound Mean] / [Empty Compound Mean])") +
  ylab("log2([Sample Compound Mean] / [Solvent Compound Mean])") +
  ggtitle("Background signal\nRaw Data / Cells / Pos Mode")

# include compounds with FC > 2.5 or FC is NA (indication of NA in solv or empty)
vf.cell.pos.cmpnd.to.incl <- vf.cell.pos.raw.grp.diff %>% 
  filter((smpl_solv_diff > 2.5 & smpl_empty_diff > 2.5) | is.na(smpl_solv_diff) | is.na(smpl_empty_diff)) %>% 
  filter(!(Compound %in% vf.cell.pos.cmpnd.to.excl$Compound))  %>% 
  filter(!(Compound %in% vf.cell.pos.cmpnd.to.excl$Compound))
# original number of metabolites
nrow(vf.cell.pos.raw.grp.diff)
[1] 168
# filtered number
nrow(vf.cell.pos.cmpnd.to.incl)
[1] 120

2.3.3 Media / Negative Mode

vf.med.neg.raw.grp.mean <- vf.med.neg.raw %>% 
  group_by(Group) %>% 
  summarize_at(vars(matches("VPA_FAnM")), mean, na.rm = TRUE) %>% 
  gather(key = "Compound", value = "Grp_mean_abun", -Group)
vf.med.neg.raw.grp.mean %>% 
  ggplot(aes(log2(Grp_mean_abun), color = Group)) +
  geom_density(size = 2, alpha = 0.8) +
  ggtitle("Distribution of compound means\nVPA + FA / Media / Negative Mode\nGrouped by sample type")

vf.med.neg.raw.grp.mean.order <- vf.med.neg.raw.grp.mean %>% 
  filter(Group == "empty") %>% 
  arrange(Grp_mean_abun)
vf.med.neg.raw %>% 
  select(Samples, Group, starts_with("VPA_FAnM")) %>% 
  gather("Compound", value = "Abundance", -c(Samples, Group)) %>% 
  mutate(Cmpnd_sort = factor(Compound, levels = vf.med.neg.raw.grp.mean.order$Compound)) %>% 
  ggplot(aes(Cmpnd_sort, log2(Abundance), color = Group, group = Samples)) + 
  geom_line(alpha = 0.1, size = 2) + 
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
  xlab("Compound") +
  geom_line(
    data = vf.med.neg.raw.grp.mean %>% 
      mutate(Cmpnd_sort = factor(Compound, levels = vf.med.neg.raw.grp.mean.order$Compound)), 
    aes(Cmpnd_sort, log2(Grp_mean_abun), color = Group, group = Group),
    size = 0.5
    ) +
  ggtitle("Profile Plot of all compound abundances\nWith average per sample type overlaid\nVPA + FA / Media / Negative Mode")

vf.med.neg.raw.grp.mean %>% 
  mutate(Cmpnd_sort = factor(Compound, levels = vf.med.neg.raw.grp.mean.order$Compound)) %>% 
  ggplot(aes(Cmpnd_sort, log2(Grp_mean_abun), color = Group, group = Group)) +
  geom_point(size = 1, alpha = 0.8) +
  geom_line(alpha = 0.8) +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
  xlab("Compound") +
  ylab("log2(Sample Type Mean)") +
  ggtitle("Profile Plot of compound means by sample type only\nVPA + FA / Media / Negative Mode")

vf.med.neg.raw.grp.diff <- vf.med.neg.raw.grp.mean %>% 
  spread(Group, Grp_mean_abun) %>% 
  mutate(
    smpl_empty_diff = sample / empty,
    smpl_solv_diff = sample / solv,
    empty_solv_diff = empty / solv
    )
vf.med.neg.raw.grp.diff %>% 
  ggplot(aes(log2(smpl_empty_diff))) +
  geom_histogram(bins = 25)

vf.med.neg.raw.grp.diff %>% 
  ggplot(aes(log2(smpl_solv_diff))) +
  geom_histogram(bins = 25)

vf.med.neg.raw.grp.diff %>% 
  ggplot(aes(log2(smpl_empty_diff), log2(smpl_solv_diff))) +
  geom_point(size = 2, alpha = 0.5) +
  xlim(-1, 4) +
  ylim(-1, 7) +
  # abline in pink
  geom_abline(intercept = 0, slope = 1, color =  "#CC79A7", size = 2, alpha = 0.6) +
  # lm line in green
  geom_smooth(method = "lm", color = "#009E73", size = 2, alpha = 0.6, se = FALSE) +
  xlab("log2([Sample Compound Mean] / [Empty Compound Mean])") +
  ylab("log2([Sample Compound Mean] / [Solvent Compound Mean])") +
  ggtitle("Background signal\nRaw Data / Media / Neg Mode")

# include compounds with FC > 2.5 or FC is NA (indication of NA in solv)
vf.med.neg.cmpnd.to.incl <- vf.med.neg.raw.grp.diff %>% 
  filter(smpl_solv_diff > 2.5 | is.na(smpl_solv_diff)) %>% 
  filter(!(Compound %in% vf.med.neg.cmpnd.to.excl$Compound))
nrow(vf.med.neg.raw.grp.diff)
[1] 53
nrow(vf.med.neg.cmpnd.to.incl)
[1] 43

2.3.4 Media / Negative Mode

vf.med.pos.raw.grp.mean <- vf.med.pos.raw %>% 
  group_by(Group) %>% 
  summarize_at(vars(matches("VPA_FApM")), mean, na.rm = TRUE) %>% 
  gather(key = "Compound", value = "Grp_mean_abun", -Group)
vf.med.pos.raw.grp.mean %>% 
  ggplot(aes(log2(Grp_mean_abun), color = Group)) +
  geom_density(size = 2, alpha = 0.8) +
  ggtitle("Distribution of compound means\nVPA + FA / Media / Positive Mode\nGrouped by sample type")

vf.med.pos.raw.grp.mean.order <- vf.med.pos.raw.grp.mean %>% 
  filter(Group == "empty") %>% 
  arrange(Grp_mean_abun)
vf.med.pos.raw %>% 
  select(Samples, Group, starts_with("VPA_FApM")) %>% 
  gather("Compound", value = "Abundance", -c(Samples, Group)) %>% 
  mutate(Cmpnd_sort = factor(Compound, levels = vf.med.pos.raw.grp.mean.order$Compound)) %>% 
  ggplot(aes(Cmpnd_sort, log2(Abundance), color = Group, group = Samples)) + 
  geom_line(alpha = 0.1, size = 2) + 
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
  xlab("Compound") +
  geom_line(
    data = vf.med.pos.raw.grp.mean %>% 
      mutate(Cmpnd_sort = factor(Compound, levels = vf.med.pos.raw.grp.mean.order$Compound)), 
    aes(Cmpnd_sort, log2(Grp_mean_abun), color = Group, group = Group),
    size = 0.5
    ) +
  ggtitle("Profile Plot of all compound abundances\nWith average per sample type overlaid\nVPA + FA / Media / Positive Mode")

vf.med.pos.raw.grp.mean %>% 
  mutate(Cmpnd_sort = factor(Compound, levels = vf.med.pos.raw.grp.mean.order$Compound)) %>% 
  ggplot(aes(Cmpnd_sort, log2(Grp_mean_abun), color = Group, group = Group)) +
  geom_point(size = 1, alpha = 0.8) +
  geom_line(alpha = 0.8) +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
  xlab("Compound") +
  ylab("log2(Sample Type Mean)") +
  ggtitle("Profile Plot of compound means by sample type only\nVPA + FA / Media / Positive Mode")

vf.med.pos.raw.grp.diff <- vf.med.pos.raw.grp.mean %>% 
  spread(Group, Grp_mean_abun) %>% 
  mutate(
    smpl_empty_diff = sample / empty,
    smpl_solv_diff = sample / solv,
    empty_solv_diff = empty / solv
    )
vf.med.pos.raw.grp.diff %>% 
  ggplot(aes(log2(smpl_empty_diff))) +
  geom_histogram(bins = 25)

vf.med.pos.raw.grp.diff %>% 
  ggplot(aes(log2(smpl_solv_diff))) +
  geom_histogram(bins = 25)

vf.med.pos.raw.grp.diff %>% 
  ggplot(aes(log2(smpl_empty_diff), log2(smpl_solv_diff))) +
  geom_point(size = 2, alpha = 0.5) +
  xlim(-5, 1) +
  ylim(-1, 7) +
  # abline in pink
  geom_abline(intercept = 0, slope = 1, color =  "#CC79A7", size = 2, alpha = 0.6) +
  # lm line in green
  geom_smooth(method = "lm", color = "#009E73", size = 2, alpha = 0.6, se = FALSE) +
  xlab("log2([Sample Compound Mean] / [Empty Compound Mean])") +
  ylab("log2([Sample Compound Mean] / [Solvent Compound Mean])") +
  ggtitle("Background signal\nRaw Data / Media / pos Mode")

# include compounds with FC > 2.5 or FC is NA (indication of NA in solv)
vf.med.pos.cmpnd.to.incl <- vf.med.pos.raw.grp.diff %>% 
  filter(smpl_solv_diff > 2.5 | is.na(smpl_solv_diff)) %>% 
  filter(!(Compound %in% vf.med.pos.cmpnd.to.excl$Compound))
nrow(vf.med.pos.raw.grp.diff)
[1] 51
nrow(vf.med.pos.cmpnd.to.incl)
[1] 32

3 Data Prep and Preliminary Analysis

3.1 Cleanup

vf.cell.neg.noNA <- vf.cell.neg.raw %>% 
  select(Samples:Plate, one_of(vf.cell.neg.cmpnd.to.incl$Compound)) %>% 
  ReplaceNAwMinLogTransformMult("VPA_FAnC")
vf.cell.pos.noNA <- vf.cell.pos.raw %>% 
  select(Samples:Plate, one_of(vf.cell.pos.cmpnd.to.incl$Compound)) %>% 
  ReplaceNAwMinLogTransformMult("VPA_FApC")
vf.med.neg.noNA <- vf.med.neg.raw %>% 
  select(Samples:Plate, one_of(vf.med.neg.cmpnd.to.incl$Compound)) %>% 
  ReplaceNAwMinLogTransformMult("VPA_FAnM")
vf.med.pos.noNA <- vf.med.pos.raw %>% 
  select(Samples:Plate, one_of(vf.med.pos.cmpnd.to.incl$Compound)) %>%  
  ReplaceNAwMinLogTransformMult("VPA_FApM")

3.2 Distribution plots

3.2.1 Cells / Negative Mode

vf.cell.neg.noNA.gathered <- vf.cell.neg.noNA %>% 
  gather(
    key = "Metabolite", "Abundance", 
    which(colnames(vf.cell.neg.noNA) == "VPA_FAnC10"):ncol(vf.cell.neg.noNA)
    )
# plot all abundances in a sample, grouped by sample as a boxplot
vf.cell.neg.noNA.gathered %>% 
  ggplot(aes(Samples, Abundance, fill = Group)) +
  geom_boxplot() +
  geom_boxplot(aes(color = Group), fatten = NULL, fill = NA, coef = 0, outlier.alpha = 0, show.legend = FALSE) +
  theme(axis.text.x = element_text(angle = 90)) +
  ylab("log2(Abundance)") +
  ggtitle("Boxplot of compound abundances\nAll samples\nVPA + FA / Cells / Negative Mode")

# same data format, but as ridge plots
vf.cell.neg.noNA.gathered %>% 
  ggplot(aes(y = Samples, x = Abundance, fill = Group)) + 
  geom_density_ridges(scale = 15) +
  theme_ridges() +
  scale_y_discrete(expand = c(0.01, 0)) +
  ggtitle("Ridge plot of compound abundances\nAll samples\nVPA + FA / Cells / Negative Mode")
Picking joint bandwidth of 1.38

# experimental samples only
vf.cell.neg.noNA.gathered %>% 
  filter(Group == "sample") %>% 
  unite("Treatment", VPA:FA, sep = "_") %>% 
  ggplot(aes(y = Samples, x = Abundance, fill = Treatment)) + 
  geom_density_ridges(scale = 10) +
  theme_ridges() +
  scale_y_discrete(expand = c(0.01, 0)) +
  ggtitle("Ridge plot of compound abundances\nExperimental samples only\nVPA + FA / Cells / Negative Mode")
Picking joint bandwidth of 1.1

# overlay the distributions for another look
vf.cell.neg.noNA.gathered %>%
  filter(Group == "sample") %>% 
  unite("Treatment", VPA:FA, sep = "_") %>% 
  ggplot(aes(Abundance, group = Samples, color = Treatment)) +
  geom_density(alpha = 0.8, size = 0.75) +
  xlab("log2(Abundance)") +
  ggtitle("Density plot of compound abundances\nExperimental samples only\nVPA + FA / Cells / Negative Mode")

3.2.2 Cells / Positive Mode

vf.cell.pos.noNA.gathered <- vf.cell.pos.noNA %>% 
  gather(
    key = "Metabolite", "Abundance", 
    which(colnames(vf.cell.pos.noNA) == "VPA_FApC10"):ncol(vf.cell.pos.noNA)
    )
vf.cell.pos.noNA.gathered %>% 
  ggplot(aes(Samples, Abundance, fill = Group)) +
  geom_boxplot() +
  geom_boxplot(aes(color = Group), fatten = NULL, fill = NA, coef = 0, outlier.alpha = 0, show.legend = FALSE) +
  theme(axis.text.x = element_text(angle = 90)) +
  ylab("log2(Abundance)") +
  ggtitle("Boxplot of compound abundances\nAll samples\nVPA + FA / Cells / Positive Mode")

vf.cell.pos.noNA.gathered %>% 
  ggplot(aes(y = Samples, x = Abundance, fill = Group)) + 
  geom_density_ridges(scale = 15) +
  theme_ridges() +
  scale_y_discrete(expand = c(0.01, 0)) +
  ggtitle("Ridge plot of compound abundances\nAll samples\nVPA + FA / Cells / Positive Mode")
Picking joint bandwidth of 1.41

vf.cell.pos.noNA.gathered %>% 
  filter(Group == "sample") %>% 
  unite("Treatment", VPA:FA, sep = "_") %>% 
  ggplot(aes(y = Samples, x = Abundance, fill = Treatment)) + 
  geom_density_ridges(scale = 10) +
  theme_ridges() +
  scale_y_discrete(expand = c(0.01, 0)) +
  ggtitle("Ridge plot of compound abundances\nExperimental samples only\nVPA + FA / Cells / Positive Mode")
Picking joint bandwidth of 1.21

vf.cell.pos.noNA.gathered %>%
  filter(Group == "sample") %>% 
  unite("Treatment", VPA:FA, sep = "_") %>% 
  ggplot(aes(Abundance, group = Samples, color = Treatment)) +
  geom_density(alpha = 0.8, size = 0.75) +
  xlab("log2(Abundance)") +
  ggtitle("Density plot of compound abundances\nExperimental samples only\nVPA + FA / Cells / Positive Mode")

3.2.3 Media / Negative Mode

vf.med.neg.noNA.gathered <- vf.med.neg.noNA %>% 
  gather(
    key = "Metabolite", "Abundance", 
    which(colnames(vf.med.neg.noNA) == "VPA_FAnM1"):ncol(vf.med.neg.noNA)
    )
vf.med.neg.noNA.gathered %>% 
  ggplot(aes(Samples, Abundance, fill = Group)) +
  geom_boxplot() +
  geom_boxplot(aes(color = Group), fatten = NULL, fill = NA, coef = 0, outlier.alpha = 0, show.legend = FALSE) +
  theme(axis.text.x = element_text(angle = 90)) +
  ylab("log2(Abundance)") +
  ggtitle("Boxplot of compound abundances\nAll samples\nVPA + FA / Media / Negative Mode")

vf.med.neg.noNA.gathered %>% 
  ggplot(aes(y = Samples, x = Abundance, fill = Group)) + 
  geom_density_ridges(scale = 15) +
  theme_ridges() +
  scale_y_discrete(expand = c(0.01, 0)) +
  ggtitle("Ridge plot of compound abundances\nAll samples\nVPA + FA / Media / Negative Mode")
Picking joint bandwidth of 0.729

vf.med.neg.noNA.gathered %>% 
  filter(Group == "sample") %>% 
  unite("Treatment", VPA:FA, sep = "_") %>% 
  ggplot(aes(y = Samples, x = Abundance, fill = Treatment)) + 
  geom_density_ridges(scale = 10) +
  theme_ridges() +
  scale_y_discrete(expand = c(0.01, 0)) +
  ggtitle("Ridge plot of compound abundances\nExperimental samples only\nVPA + FA / Media / Negative Mode")
Picking joint bandwidth of 0.706

vf.med.neg.noNA.gathered %>%
  filter(Group == "sample") %>% 
  unite("Treatment", VPA:FA, sep = "_") %>% 
  ggplot(aes(Abundance, group = Samples, color = Treatment)) +
  geom_density(alpha = 0.8, size = 0.75) +
  xlab("log2(Abundance)") +
  ggtitle("Density plot of compound abundances\nExperimental samples only\nVPA + FA / Media / Negative Mode")

3.2.4 Media / Positive Mode

vf.med.pos.noNA.gathered <- vf.med.pos.noNA %>% 
  gather(
    key = "Metabolite", "Abundance", 
    which(colnames(vf.med.pos.noNA) == "VPA_FApM1"):ncol(vf.med.pos.noNA)
    )
vf.med.pos.noNA.gathered %>% 
  ggplot(aes(Samples, Abundance, fill = Group)) +
  geom_boxplot() +
  geom_boxplot(aes(color = Group), fatten = NULL, fill = NA, coef = 0, outlier.alpha = 0, show.legend = FALSE) +
  theme(axis.text.x = element_text(angle = 90)) +
  ylab("log2(Abundance)") +
  ggtitle("Boxplot of compound abundances\nAll samples\nVPA + FA / Media / Positive Mode")

vf.med.pos.noNA.gathered %>% 
  ggplot(aes(y = Samples, x = Abundance, fill = Group)) + 
  geom_density_ridges(scale = 15) +
  theme_ridges() +
  scale_y_discrete(expand = c(0.01, 0)) +
  ggtitle("Ridge plot of compound abundances\nAll samples\nVPA + FA / Media / Positive Mode")
Picking joint bandwidth of 1.5

vf.med.pos.noNA.gathered %>% 
  filter(Group == "sample") %>% 
  unite("Treatment", VPA:FA, sep = "_") %>% 
  ggplot(aes(y = Samples, x = Abundance, fill = Treatment)) + 
  geom_density_ridges(scale = 10) +
  theme_ridges() +
  scale_y_discrete(expand = c(0.01, 0)) +
  ggtitle("Ridge plot of compound abundances\nExperimental samples only\nVPA + FA / Media / Positive Mode")
Picking joint bandwidth of 1.5

vf.med.pos.noNA.gathered %>%
  filter(Group == "sample") %>% 
  unite("Treatment", VPA:FA, sep = "_") %>% 
  ggplot(aes(Abundance, group = Samples, color = Treatment)) +
  geom_density(alpha = 0.8, size = 0.75) +
  xlab("log2(Abundance)") +
  ggtitle("Density plot of compound abundances\nExperimental samples only\nVPA + FA / Media / Positive Mode")

3.3 Principal Component Analysis

Some plots to understand the relationship between the samples, QC samples, solvent, and empty samples in some cases.

3.3.1 Cells / Negative Mode

### PCA on all Samples ###
vf.cell.neg.full.pca <- vf.cell.neg.noNA %>% 
  select(starts_with("VPA_FAnC")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
plot(
  (vf.cell.neg.full.pca$sdev ^ 2) * 100 / sum(vf.cell.neg.full.pca$sdev ^ 2), 
  xlab = "Principal Component",
  ylab = "Variance Explained",
  main = "Percent variance explained by each principal component\nAll samples only\nVPA + FA / Cells / Negative Mode",
  type = "b"
  )

vf.cell.neg.full.pca.x <- as.data.frame(vf.cell.neg.full.pca$x)
row.names(vf.cell.neg.full.pca.x) <- vf.cell.neg.noNA$Samples
vf.cell.neg.full.pca.x <- vf.cell.neg.full.pca.x %>% 
  bind_cols(vf.cell.neg.noNA %>% select(Group:Plate))
vf.cell.neg.full.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = Group)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (91.7% Var)") +
  ylab("PC2 (4.2% Var)") +
  ggtitle("Principal Component Analysis\nAll Samples\nVPA + FA / Cells / Negative Mode")

### Samples and QC ###
vf.cell.neg.smpl.QC.pca <- vf.cell.neg.noNA %>% 
  filter(Group == "sample" | Group == "QC") %>% 
  select(starts_with("VPA_FAnC")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
plot(
  (vf.cell.neg.smpl.QC.pca$sdev ^ 2) * 100 / sum(vf.cell.neg.smpl.QC.pca$sdev ^ 2), 
  xlab = "Principal Component",
  ylab = "Variance Explained",
  main = "Percent variance explained by each principal component\nSamples and QC\nVPA + FA / Cells / Negative Mode",
  type = "b"
  )

vf.cell.neg.smpl.QC.pca.x <- as.data.frame(vf.cell.neg.smpl.QC.pca$x)
vf.cell.neg.smpl.QC.pca.x <- vf.cell.neg.smpl.QC.pca.x %>% 
  bind_cols(
    vf.cell.neg.noNA %>% 
      filter(Group == "sample" | Group == "QC") %>% 
      select(Samples, Group:Plate)
  )
row.names(vf.cell.neg.smpl.QC.pca.x) <- vf.cell.neg.smpl.QC.pca.x$Samples
vf.cell.neg.smpl.QC.pca.x %>% 
  unite("Treatment", VPA:FA, sep = "_") %>% 
  ggplot(aes(x = PC1, y = PC2, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (32.3% Var)") +
  ylab("PC2 (25.1% Var)") +
  ggtitle("Principal Component Analysis\nSamples and QC\nVPA + FA / Cells / Negative Mode")

vf.cell.neg.smpl.QC.pca.x %>% 
  unite("Treatment", VPA:FA, sep = "_") %>% 
  ggplot(aes(x = PC3, y = PC4, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC3 (13.7% Var)") +
  ylab("PC4 (6.5% Var)") +
  ggtitle("Principal Component Analysis\nSamples and QC\nVPA + FA / Cells / Negative Mode")

### Experimental Samples Only ###
vf.cell.neg.smpl.pca <- vf.cell.neg.noNA %>% 
  filter(Group == "sample") %>% 
  select(starts_with("VPA_FAnC")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
plot(
  (vf.cell.neg.smpl.pca$sdev ^ 2) * 100 / sum(vf.cell.neg.smpl.pca$sdev ^ 2), 
  xlab = "Principal Component",
  ylab = "Variance Explained",
  main = "Percent variance explained by each principal component\nExperimental samples only\nVPA + FA / Cells / Negative Mode",
  type = "b"
  )

vf.cell.neg.smpl.pca.x <- as.data.frame(vf.cell.neg.smpl.pca$x)
vf.cell.neg.smpl.pca.x <- vf.cell.neg.smpl.pca.x %>% 
  bind_cols(
    vf.cell.neg.noNA %>% 
      filter(Group == "sample") %>% 
      select(Samples, Group:Plate)
  )
row.names(vf.cell.neg.smpl.pca.x) <- vf.cell.neg.smpl.pca.x$Samples
vf.cell.neg.smpl.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = FA, shape = VPA)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (44.5% Var)") +
  ylab("PC2 (19.3% Var)") +
  ggtitle("Principal Component Analysis\nExperimental samples only\nVPA + FA / Cells / Negative Mode")

vf.cell.neg.smpl.pca.x %>% 
  ggplot(aes(x = PC3, y = PC4, color = FA, shape = VPA)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC3 (10.4% Var)") +
  ylab("PC4 (7.7% Var)") +
  ggtitle("Principal Component Analysis\nExperimental samples only\nVPA + FA / Cells / Negative Mode")

3.3.2 Cells / Positive Mode

### PCA on all Samples ###
vf.cell.pos.full.pca <- vf.cell.pos.noNA %>% 
  select(starts_with("VPA_FApC")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
plot(
  (vf.cell.pos.full.pca$sdev ^ 2) * 100 / sum(vf.cell.pos.full.pca$sdev ^ 2), 
  xlab = "Principal Component",
  ylab = "Variance Explained",
  main = "Percent variance explained by each principal component\nAll samples only\nVPA + FA / Cells / Positive Mode",
  type = "b"
  )

vf.cell.pos.full.pca.x <- as.data.frame(vf.cell.pos.full.pca$x)
row.names(vf.cell.pos.full.pca.x) <- vf.cell.pos.noNA$Samples
vf.cell.pos.full.pca.x <- vf.cell.pos.full.pca.x %>% 
  bind_cols(vf.cell.pos.noNA %>% select(Group:Plate))
vf.cell.pos.full.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = Group)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (92.0% Var)") +
  ylab("PC2 (2.7% Var)") +
  ggtitle("Principal Component Analysis\nAll Samples\nVPA + FA / Cells / Positive Mode")

### Samples and QC ###
vf.cell.pos.smpl.QC.pca <- vf.cell.pos.noNA %>% 
  filter(Group == "sample" | Group == "QC") %>% 
  select(starts_with("VPA_FApC")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
plot(
  (vf.cell.pos.smpl.QC.pca$sdev ^ 2) * 100 / sum(vf.cell.pos.smpl.QC.pca$sdev ^ 2), 
  xlab = "Principal Component",
  ylab = "Variance Explained",
  main = "Percent variance explained by each principal component\nSamples and QC\nVPA + FA / Cells / Positive Mode",
  type = "b"
  )

vf.cell.pos.smpl.QC.pca.x <- as.data.frame(vf.cell.pos.smpl.QC.pca$x)
vf.cell.pos.smpl.QC.pca.x <- vf.cell.pos.smpl.QC.pca.x %>% 
  bind_cols(
    vf.cell.pos.noNA %>% 
      filter(Group == "sample" | Group == "QC") %>% 
      select(Samples, Group:Plate)
  )
row.names(vf.cell.pos.smpl.QC.pca.x) <- vf.cell.pos.smpl.QC.pca.x$Samples
vf.cell.pos.smpl.QC.pca.x %>% 
  unite("Treatment", VPA:FA, sep = "_", remove = FALSE) %>% 
  ggplot(aes(x = PC1, y = PC2, color = Treatment, shape = VPA)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (38.2% Var)") +
  ylab("PC2 (20.0% Var)") +
  ggtitle("Principal Component Analysis\nSamples and QC\nVPA + FA / Cells / Positive Mode")

vf.cell.pos.smpl.QC.pca.x %>% 
  unite("Treatment", VPA:FA, sep = "_") %>% 
  ggplot(aes(x = PC3, y = PC4, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC3 (12.0% Var)") +
  ylab("PC4 (7.7% Var)") +
  ggtitle("Principal Component Analysis\nSamples and QC\nVPA + FA / Cells / Positive Mode")

### Experimental Samples Only ###
vf.cell.pos.smpl.pca <- vf.cell.pos.noNA %>% 
  filter(Group == "sample") %>% 
  select(starts_with("VPA_FApC")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
plot(
  (vf.cell.pos.smpl.pca$sdev ^ 2) * 100 / sum(vf.cell.pos.smpl.pca$sdev ^ 2), 
  xlab = "Principal Component",
  ylab = "Variance Explained",
  main = "Percent variance explained by each principal component\nExperimental samples only\nVPA + FA / Cells / Positive Mode",
  type = "b"
  )

vf.cell.pos.smpl.pca.x <- as.data.frame(vf.cell.pos.smpl.pca$x)
vf.cell.pos.smpl.pca.x <- vf.cell.pos.smpl.pca.x %>% 
  bind_cols(
    vf.cell.pos.noNA %>% 
      filter(Group == "sample") %>% 
      select(Samples, Group:Plate)
  )
row.names(vf.cell.pos.smpl.pca.x) <- vf.cell.pos.smpl.pca.x$Samples
vf.cell.pos.smpl.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = FA, shape = VPA)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (49.4% Var)") +
  ylab("PC2 (15.7% Var)") +
  ggtitle("Principal Component Analysis\nExperimental samples only\nVPA + FA / Cells / Positive Mode")

vf.cell.pos.smpl.pca.x %>% 
  ggplot(aes(x = PC3, y = PC4, color = FA, shape = VPA)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC3 (9.8% Var)") +
  ylab("PC4 (7.6% Var)") +
  ggtitle("Principal Component Analysis\nExperimental samples only\nVPA + FA / Cells / Positive Mode")

3.3.3 Media / Negative Mode

### PCA on all Samples ###
vf.med.neg.full.pca <- vf.med.neg.noNA %>% 
  select(starts_with("VPA_FAnM")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
plot(
  (vf.med.neg.full.pca$sdev ^ 2) * 100 / sum(vf.med.neg.full.pca$sdev ^ 2), 
  xlab = "Principal Component",
  ylab = "Variance Explained",
  main = "Percent variance explained by each principal component\nAll samples only\nVPA + FA / Media / Negative Mode",
  type = "b"
  )

vf.med.neg.full.pca.x <- as.data.frame(vf.med.neg.full.pca$x)
row.names(vf.med.neg.full.pca.x) <- vf.med.neg.noNA$Samples
vf.med.neg.full.pca.x <- vf.med.neg.full.pca.x %>% 
  bind_cols(vf.med.neg.noNA %>% select(Group:Plate))
vf.med.neg.full.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = Group)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (84.2% Var)") +
  ylab("PC2 (5.2% Var)") +
  ggtitle("Principal Component Analysis\nAll Samples\nVPA + FA / Media / Negative Mode")

### Samples and QC ###
vf.med.neg.smpl.QC.pca <- vf.med.neg.noNA %>% 
  filter(Group == "sample" | Group == "QC") %>% 
  select(starts_with("VPA_FAnM")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
plot(
  (vf.med.neg.smpl.QC.pca$sdev ^ 2) * 100 / sum(vf.med.neg.smpl.QC.pca$sdev ^ 2), 
  xlab = "Principal Component",
  ylab = "Variance Explained",
  main = "Percent variance explained by each principal component\nSamples and QC\nVPA + FA / Media / Negative Mode",
  type = "b"
  )

vf.med.neg.smpl.QC.pca.x <- as.data.frame(vf.med.neg.smpl.QC.pca$x)
vf.med.neg.smpl.QC.pca.x <- vf.med.neg.smpl.QC.pca.x %>% 
  bind_cols(
    vf.med.neg.noNA %>% 
      filter(Group == "sample" | Group == "QC") %>% 
      select(Samples, Group:Plate)
  )
row.names(vf.med.neg.smpl.QC.pca.x) <- vf.med.neg.smpl.QC.pca.x$Samples
vf.med.neg.smpl.QC.pca.x %>% 
  unite("Treatment", VPA:FA, sep = "_") %>% 
  ggplot(aes(x = PC1, y = PC2, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (49.3% Var)") +
  ylab("PC2 (25.9% Var)") +
  ggtitle("Principal Component Analysis\nSamples and QC\nVPA + FA / Media / Negative Mode")

vf.med.neg.smpl.QC.pca.x %>% 
  unite("Treatment", VPA:FA, sep = "_") %>% 
  ggplot(aes(x = PC3, y = PC4, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC3 (11.0% Var)") +
  ylab("PC4 (5.0% Var)") +
  ggtitle("Principal Component Analysis\nSamples and QC\nVPA + FA / Media / Negative Mode")

### Experimental Samples Only ###
vf.med.neg.smpl.pca <- vf.med.neg.noNA %>% 
  filter(Group == "sample") %>% 
  select(starts_with("VPA_FAnM")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
plot(
  (vf.med.neg.smpl.pca$sdev ^ 2) * 100 / sum(vf.med.neg.smpl.pca$sdev ^ 2), 
  xlab = "Principal Component",
  ylab = "Variance Explained",
  main = "Percent variance explained by each principal component\nExperimental samples only\nVPA + FA / Media / Negative Mode",
  type = "b"
  )

vf.med.neg.smpl.pca.x <- as.data.frame(vf.med.neg.smpl.pca$x)
vf.med.neg.smpl.pca.x <- vf.med.neg.smpl.pca.x %>% 
  bind_cols(
    vf.med.neg.noNA %>% 
      filter(Group == "sample") %>% 
      select(Samples, Group:Plate)
  )
row.names(vf.med.neg.smpl.pca.x) <- vf.med.neg.smpl.pca.x$Samples
vf.med.neg.smpl.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = FA, shape = VPA)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (53.5% Var)") +
  ylab("PC2 (26.6% Var)") +
  ggtitle("Principal Component Analysis\nExperimental samples only\nVPA + FA / Media / Negative Mode")

vf.med.neg.smpl.pca.x %>% 
  ggplot(aes(x = PC3, y = PC4, color = FA, shape = VPA)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC3 (11.6% Var)") +
  ylab("PC4 (3.6% Var)") +
  ggtitle("Principal Component Analysis\nExperimental samples only\nVPA + FA / Media / Negative Mode")

3.3.4 Media / Positive Mode

### PCA on all Samples ###
vf.med.pos.full.pca <- vf.med.pos.noNA %>% 
  select(starts_with("VPA_FApM")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
plot(
  (vf.med.pos.full.pca$sdev ^ 2) * 100 / sum(vf.med.pos.full.pca$sdev ^ 2), 
  xlab = "Principal Component",
  ylab = "Variance Explained",
  main = "Percent variance explained by each principal component\nAll samples only\nVPA + FA / Media / Positive Mode",
  type = "b"
  )

vf.med.pos.full.pca.x <- as.data.frame(vf.med.pos.full.pca$x)
row.names(vf.med.pos.full.pca.x) <- vf.med.pos.noNA$Samples
vf.med.pos.full.pca.x <- vf.med.pos.full.pca.x %>% 
  bind_cols(vf.med.pos.noNA %>% select(Group:Plate))
vf.med.pos.full.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = Group)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (84.2% Var)") +
  ylab("PC2 (5.2% Var)") +
  ggtitle("Principal Component Analysis\nAll Samples\nVPA + FA / Media / Positive Mode")

### Samples and QC ###
vf.med.pos.smpl.QC.pca <- vf.med.pos.noNA %>% 
  filter(Group == "sample" | Group == "QC") %>% 
  select(starts_with("VPA_FApM")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
plot(
  (vf.med.pos.smpl.QC.pca$sdev ^ 2) * 100 / sum(vf.med.pos.smpl.QC.pca$sdev ^ 2), 
  xlab = "Principal Component",
  ylab = "Variance Explained",
  main = "Percent variance explained by each principal component\nSamples and QC\nVPA + FA / Media / Positive Mode",
  type = "b"
  )

vf.med.pos.smpl.QC.pca.x <- as.data.frame(vf.med.pos.smpl.QC.pca$x)
vf.med.pos.smpl.QC.pca.x <- vf.med.pos.smpl.QC.pca.x %>% 
  bind_cols(
    vf.med.pos.noNA %>% 
      filter(Group == "sample" | Group == "QC") %>% 
      select(Samples, Group:Plate)
  )
row.names(vf.med.pos.smpl.QC.pca.x) <- vf.med.pos.smpl.QC.pca.x$Samples
vf.med.pos.smpl.QC.pca.x %>% 
  unite("Treatment", VPA:FA, sep = "_") %>% 
  ggplot(aes(x = PC1, y = PC2, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (62.4% Var)") +
  ylab("PC2 (16.0% Var)") +
  ggtitle("Principal Component Analysis\nSamples and QC\nVPA + FA / Media / Positive Mode")

vf.med.pos.smpl.QC.pca.x %>% 
  unite("Treatment", VPA:FA, sep = "_") %>% 
  ggplot(aes(x = PC3, y = PC4, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC3 (11.1% Var)") +
  ylab("PC4 (4.2% Var)") +
  ggtitle("Principal Component Analysis\nSamples and QC\nVPA + FA / Media / Positive Mode")

### Experimental Samples Only ###
vf.med.pos.smpl.pca <- vf.med.pos.noNA %>% 
  filter(Group == "sample") %>% 
  select(starts_with("VPA_FApM")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
plot(
  (vf.med.pos.smpl.pca$sdev ^ 2) * 100 / sum(vf.med.pos.smpl.pca$sdev ^ 2), 
  xlab = "Principal Component",
  ylab = "Variance Explained",
  main = "Percent variance explained by each principal component\nExperimental samples only\nVPA + FA / Media / Positive Mode",
  type = "b"
  )

vf.med.pos.smpl.pca.x <- as.data.frame(vf.med.pos.smpl.pca$x)
vf.med.pos.smpl.pca.x <- vf.med.pos.smpl.pca.x %>% 
  bind_cols(
    vf.med.pos.noNA %>% 
      filter(Group == "sample") %>% 
      select(Samples, Group:Plate)
  )
row.names(vf.med.pos.smpl.pca.x) <- vf.med.pos.smpl.pca.x$Samples
vf.med.pos.smpl.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = FA, shape = VPA, label = row.names(vf.med.pos.smpl.pca.x))) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (50.1% Var)") +
  ylab("PC2 (22.0% Var)") +
  ggtitle("Principal Component Analysis\nExperimental samples only\nVPA + FA / Media / Positive Mode") +
  geom_text(hjust = 1)

vf.med.pos.smpl.pca.x %>% 
  ggplot(aes(x = PC3, y = PC4, color = FA, shape = VPA)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC3 (16.8% Var)") +
  ylab("PC4 (4.4% Var)") +
  ggtitle("Principal Component Analysis\nExperimental samples only\nVPA + FA / Media / Positive Mode")

vf.med.pos.noNA <- vf.med.pos.noNA %>% 
  filter(Samples != "T1_P2_FA2")

4 Batch Effects and Signifiance Testing

4.1 VPA metabolism

Is VPA metabolised by cells?

vf.med.neg.noNA %>% 
  select(Samples:Plate, VPA_FAnM27) %>% 
  unite(col = "Treatment_Group", Group, VPA, FA, sep = "_") %>% 
  ggplot(aes(Treatment_Group, VPA_FAnM27)) +
  geom_jitter(alpha = 0.8, width = 0.1) +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))

vf.med.neg.noNA %>% 
  filter((Group == "sample" | Group == "empty") & VPA == "vpa") %>% 
  group_by(Group, VPA, FA) %>% 
  summarize(
    vpa.avg = mean(VPA_FAnM27),
    vpa.sd = sd(VPA_FAnM27)
  )
# A tibble: 4 x 5
# Groups:   Group, VPA [?]
  Group  VPA   FA    vpa.avg vpa.sd
  <chr>  <chr> <chr>   <dbl>  <dbl>
1 empty  vpa   cntrl    17.3 0.0517
2 empty  vpa   fa       17.3 0.0441
3 sample vpa   cntrl    17.3 0.241 
4 sample vpa   fa       17.4 0.166 

It seems likely that VPA is not getting metabolised to a great extent, but it is not possible to be sure.

4.2 Folic acid metabolism

Is folic acid metabolised by cells?

vf.med.neg.noNA %>% 
  select(Samples:Plate, VPA_FAnM52) %>% 
  unite(col = "Treatment_Group", Group, VPA, FA, sep = "_") %>% 
  ggplot(aes(Treatment_Group, VPA_FAnM52)) +
  geom_jitter(alpha = 0.8, width = 0.1) +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))

vf.med.neg.noNA %>% 
  filter((Group == "sample" | Group == "empty") & FA == "fa") %>% 
  group_by(Group, VPA, FA) %>% 
  summarize(
    fa.avg = mean(VPA_FAnM52),
    fa.sd = sd(VPA_FAnM52)
  )
# A tibble: 4 x 5
# Groups:   Group, VPA [?]
  Group  VPA   FA    fa.avg  fa.sd
  <chr>  <chr> <chr>  <dbl>  <dbl>
1 empty  cntrl fa      14.6 0.214 
2 empty  vpa   fa      14.4 0.0992
3 sample cntrl fa      14.4 0.370 
4 sample vpa   fa      14.6 0.361 

4.3 Cells / Negative Mode

# sample prep
vf.cell.neg.smpl.data <- vf.cell.neg.noNA %>% 
  filter(Group == "sample") %>% 
  unite("Treatment", VPA:FA, sep = "_")
vf.cell.neg.data.for.sva <- as.matrix(
  vf.cell.neg.smpl.data[, which(
    colnames(vf.cell.neg.smpl.data) == "VPA_FAnC10"
    ):ncol(vf.cell.neg.smpl.data)]
  )
row.names(vf.cell.neg.data.for.sva) <- vf.cell.neg.smpl.data$Samples
vf.cell.neg.data.for.sva <- t(vf.cell.neg.data.for.sva)
# pheno prep
vf.cell.neg.data.pheno <- as.data.frame(vf.cell.neg.smpl.data[, 5:6])
row.names(vf.cell.neg.data.pheno) <- vf.cell.neg.smpl.data$Samples
# sva calculation
vf.cell.neg.mod.vf <- model.matrix(~ 0 + as.factor(Treatment), data = vf.cell.neg.data.pheno)
vf.cell.neg.mod0 <- model.matrix(~ 1, data = vf.cell.neg.data.pheno)
set.seed(2018)
num.sv(vf.cell.neg.data.for.sva, vf.cell.neg.mod.vf, method = "be")
[1] 3
set.seed(2018)
num.sv(vf.cell.neg.data.for.sva, vf.cell.neg.mod.vf, method = "leek")
[1] 0
set.seed(2018)
vf.cell.neg.sv <- sva(vf.cell.neg.data.for.sva, vf.cell.neg.mod.vf, vf.cell.neg.mod0)
Number of significant surrogate variables is:  3 
Iteration (out of 5 ):1  2  3  4  5  
# extract the surrogate variables
vf.cell.neg.surr.var <- as.data.frame(vf.cell.neg.sv$sv)
colnames(vf.cell.neg.surr.var) <- c("S1", "S2", "S3")
colnames(vf.cell.neg.mod.vf) <- c("cntrl", "fa_only", "vpa_only", "fa_and_vpa")
# combine the full model matrix and the surrogate variables into one
vf.cell.neg.design.sv <- cbind(vf.cell.neg.mod.vf, vf.cell.neg.surr.var)
vf.cell.neg.cont.mat <- makeContrasts(
  vpa_lowFA = vpa_only - cntrl, 
  vpa_highFA = fa_and_vpa - fa_only,
  FA_diff = (fa_and_vpa - fa_only) - (vpa_only - cntrl),
  levels = c("cntrl", "fa_only", "vpa_only", "fa_and_vpa", "S1", "S2", "S3")
  )
# fit the model/design matrix
vf.cell.neg.eb <- vf.cell.neg.data.for.sva %>% 
  lmFit(vf.cell.neg.design.sv) %>% 
  contrasts.fit(vf.cell.neg.cont.mat) %>% 
  eBayes()
# pull out the results for each metabolite for each comparison
vf.cell.neg.eb.tidy <- tidy(vf.cell.neg.eb) %>% 
  mutate(
    adj_pval = p.adjust(p.value, method = "bonferroni"),
    FC = 2 ^ estimate,
    change_in_vpa = ifelse(FC < 1, "down", "up")
    ) %>% 
  rename(compound_short = gene)
# volcano plot
vf.cell.neg.eb.tidy %>% 
  ggplot(aes(estimate, -log10(adj_pval))) +
  geom_point(size = 2, alpha = 0.5) +
  geom_hline(linetype = "dashed", color = "#009E73", yintercept = -log10(0.05)) +
  geom_vline(linetype = "dashed", color = "#CC79A7", xintercept = log2(1.2)) +
  geom_vline(linetype = "dashed", color = "#CC79A7", xintercept = log2(1/1.2)) +
  xlim(-2, 2)  +
  xlab("log2(FC)") +
  ylab("-log10(adjusted p-value)") +
  ggtitle("Volcano plot\nVPA + FA / Cells / Negative Mode")

# select statistically significant hits with a certain FC:
vf.cell.neg.hits <- vf.cell.neg.eb.tidy %>% 
  filter(adj_pval < 0.05 & (FC > 1.2 | FC < 1/1.2)) %>% 
  inner_join(vf.cell.neg.compound.info, by = "compound_short")
# how many metabolites are significant across the different contrats
table(vf.cell.neg.hits$term)

   FA_diff vpa_highFA  vpa_lowFA 
         0         31         31 
# significant metabolites
sort(unique(vf.cell.neg.hits$compound_full))
 [1] "2-Aminoadipic Acid"       "2,4-diene-Valproic acid" 
 [3] "3-Phosphoglyceroinositol" "3-Sulfinoalanine"        
 [5] "5-Methyl-DHF"             "ADP-Glucose"             
 [7] "AMP"                      "Aspartic Acid"           
 [9] "ATP"                      "Cystathionine"           
[11] "Cysteic Acid"             "D-Galactitol"            
[13] "dTDP"                     "dTTP"                    
[15] "G3P"                      "Galacturonic Acid"       
[17] "GalNAc"                   "Glyceric Acid"           
[19] "GMP"                      "GTP"                     
[21] "Guanosine"                "Inosine"                 
[23] "Ketoleucine"              "Lactic Acid"             
[25] "myo-Inositol"             "N-Acetylalanine"         
[27] "N-Acetylaspartic Acid"    "N-Acetylserine"          
[29] "NADH"                     "Neu5Ac"                  
[31] "Proline"                  "Thymidine"               
[33] "UMP"                      "UTP"                     
vf.cell.neg.hits.tally2 <- vf.cell.neg.hits %>% 
  group_by(compound_short, compound_full) %>% 
  count() %>% 
  filter(n == 2)
vf.cell.neg.lowFA.hits <- vf.cell.neg.hits %>% 
  filter(term == "vpa_lowFA" & !(compound_short %in% vf.cell.neg.hits.tally2$compound_short))
vf.cell.neg.highFA.hits <- vf.cell.neg.hits %>% 
  filter(term == "vpa_highFA" & !(compound_short %in% vf.cell.neg.hits.tally2$compound_short))
vf.cell.neg.both.hits <- vf.cell.neg.hits %>% 
  filter(compound_short %in% vf.cell.neg.hits.tally2$compound_short) %>% 
  arrange(compound_short, term)
vf.cell.neg.both.hits %>% 
  select(compound_full, term, FC) %>% 
  spread(key = "term", value = "FC") %>% 
  ggplot(aes(vpa_lowFA, vpa_highFA)) +
  geom_point(size = 2, alpha = 0.8) +
  geom_abline(intercept = 0, slope = 1, color = "blue", alpha = 0.8)

vf.cell.neg.both.hits %>% 
  select(compound_full, term, FC) %>% 
  spread(key = "term", value = "FC") %>% 
  mutate(diff = vpa_highFA - vpa_lowFA) %>% 
  arrange(diff)
# A tibble: 28 x 4
   compound_full            vpa_highFA vpa_lowFA     diff
   <chr>                         <dbl>     <dbl>    <dbl>
 1 2,4-diene-Valproic acid       1.78      2.41  -0.630  
 2 5-Methyl-DHF                  1.89      2.45  -0.562  
 3 Aspartic Acid                 1.40      1.49  -0.0854 
 4 ADP-Glucose                   2.87      2.95  -0.0822 
 5 GTP                           1.27      1.34  -0.0609 
 6 3-Sulfinoalanine              1.22      1.28  -0.0521 
 7 N-Acetylaspartic Acid         1.21      1.27  -0.0516 
 8 3-Phosphoglyceroinositol      0.766     0.786 -0.0201 
 9 dTDP                          0.604     0.614 -0.0101 
10 Galacturonic Acid             0.598     0.607 -0.00882
# ... with 18 more rows
### Plotting ###
vf.cell.neg.gathered <- vf.cell.neg.noNA %>%
  filter(Group == "sample") %>% 
  bind_cols(vf.cell.neg.surr.var) %>% 
  select(Samples, VPA, FA, S1:S3, starts_with("VPA_FAnC")) %>% 
  gather(key = "Compound", value = "Abundance", VPA_FAnC10:VPA_FAnC99)
vf.cell.neg.nested <- vf.cell.neg.gathered %>% 
  group_by(Compound) %>% 
  nest() %>% 
  mutate(model = map(data, ~lm(Abundance ~ S1 + S2 + S3, data = .))) %>% 
  mutate(augment_model = map(model, augment))
vf.cell.neg.modSV.resid <- vf.cell.neg.nested %>% 
  unnest(data, augment_model) %>% 
  select(Samples, VPA, FA, Compound, .resid) %>% 
  spread(Compound, .resid)
vf.cell.neg.modSV.resid %>% 
  select(Samples:FA, one_of(unique(vf.cell.neg.hits$compound_short))) %>% 
  HeatmapPrepAlt("VPA_FAnC") %>% 
  t() %>% 
  heatmaply(
    colors = viridis(n = 10, option = "magma"), 
    xlab = "Samples", ylab = "Compounds",
    main = "Statistically significant compounds\nVPA + FA Experiment / Cells / Neg Mode",
    margins = c(50, 50, 75, 30),
    k_col = 2, k_row = 2,
    labRow = unique(vf.cell.neg.hits$compound_full)
    )
### PCA - cleaned data ###
vf.cell.neg.modSV.pca <- vf.cell.neg.modSV.resid %>% 
  select(starts_with("VPA_FAnC")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
vf.cell.neg.modSV.pca.x <- as.data.frame(vf.cell.neg.modSV.pca$x)
row.names(vf.cell.neg.modSV.pca.x) <- vf.cell.neg.modSV.resid$Samples
vf.cell.neg.modSV.pca.x <- vf.cell.neg.modSV.pca.x %>% 
  bind_cols(vf.cell.neg.modSV.resid %>% select(VPA:FA))
vf.cell.neg.modSV.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = VPA, shape = FA)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (61.2% Var)") +
  ylab("PC2 (14.7% Var)")

vf.cell.neg.modSV.pca.x %>% 
  ggplot(aes(x = PC3, y = PC4, color = VPA, shape = FA)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC3 (5.6% Var)") +
  ylab("PC4 (4.5% Var)")

4.4 Cells / Positive Mode

# sample prep
vf.cell.pos.smpl.data <- vf.cell.pos.noNA %>% 
  filter(Group == "sample") %>% 
  unite("Treatment", VPA:FA, sep = "_")
vf.cell.pos.data.for.sva <- as.matrix(
  vf.cell.pos.smpl.data[, which(
    colnames(vf.cell.pos.smpl.data) == "VPA_FApC10"
    ):ncol(vf.cell.pos.smpl.data)]
  )
row.names(vf.cell.pos.data.for.sva) <- vf.cell.pos.smpl.data$Samples
vf.cell.pos.data.for.sva <- t(vf.cell.pos.data.for.sva)
# pheno prep
vf.cell.pos.data.pheno <- as.data.frame(vf.cell.pos.smpl.data[, 5:6])
row.names(vf.cell.pos.data.pheno) <- vf.cell.pos.smpl.data$Samples
# sva calculation
vf.cell.pos.mod.vf <- model.matrix(~ 0 + as.factor(Treatment), data = vf.cell.pos.data.pheno)
vf.cell.pos.mod0 <- model.matrix(~ 1, data = vf.cell.pos.data.pheno)
set.seed(2018)
num.sv(vf.cell.pos.data.for.sva, vf.cell.pos.mod.vf, method = "be")
[1] 2
set.seed(2018)
num.sv(vf.cell.pos.data.for.sva, vf.cell.pos.mod.vf, method = "leek")
[1] 0
set.seed(2018)
vf.cell.pos.sv <- sva(vf.cell.pos.data.for.sva, vf.cell.pos.mod.vf, vf.cell.pos.mod0)
Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  
# extract the surrogate variables
vf.cell.pos.surr.var <- as.data.frame(vf.cell.pos.sv$sv)
colnames(vf.cell.pos.surr.var) <- c("S1", "S2")
colnames(vf.cell.pos.mod.vf) <- c("cntrl", "fa_only", "vpa_only", "fa_and_vpa")
# combine the full model matrix and the surrogate variables into one
vf.cell.pos.design.sv <- cbind(vf.cell.pos.mod.vf, vf.cell.pos.surr.var)
vf.cell.pos.cont.mat <- makeContrasts(
  vpa_lowFA = vpa_only - cntrl, 
  vpa_highFA = fa_and_vpa - fa_only,
  FA_diff = (fa_and_vpa - fa_only) - (vpa_only - cntrl),
  levels = c("cntrl", "fa_only", "vpa_only", "fa_and_vpa", "S1", "S2")
  )
# fit the model/design matrix
vf.cell.pos.eb <- vf.cell.pos.data.for.sva %>% 
  lmFit(vf.cell.pos.design.sv) %>% 
  contrasts.fit(vf.cell.pos.cont.mat) %>% 
  eBayes()
# pull out the results for each metabolite for each comparison
vf.cell.pos.eb.tidy <- tidy(vf.cell.pos.eb) %>% 
  mutate(
    adj_pval = p.adjust(p.value, method = "bonferroni"),
    FC = 2 ^ estimate,
    change_in_vpa = ifelse(FC < 1, "down", "up")
    ) %>% 
  rename(compound_short = gene)
# volcano plot
vf.cell.pos.eb.tidy %>% 
  ggplot(aes(estimate, -log10(adj_pval))) +
  geom_point(size = 2, alpha = 0.5) +
  geom_hline(linetype = "dashed", color = "#009E73", yintercept = -log10(0.05)) +
  geom_vline(linetype = "dashed", color = "#CC79A7", xintercept = log2(1.2)) +
  geom_vline(linetype = "dashed", color = "#CC79A7", xintercept = log2(1/1.2)) +
  xlim(-2.5, 2.5) +
  xlab("log2(FC)") +
  ylab("-log10(adjusted p-value)") +
  ggtitle("Volcano plot\nVPA + FA / Cells / Positive Mode")

# select statistically significant hits with a certain FC:
vf.cell.pos.hits <- vf.cell.pos.eb.tidy %>% 
  filter(adj_pval < 0.05 & (FC > 1.2 | FC < 1/1.2)) %>% 
  inner_join(vf.cell.pos.compound.info, by = "compound_short")
# how many metabolites are significant across the different contrats
table(vf.cell.pos.hits$term)

   FA_diff vpa_highFA  vpa_lowFA 
         0         24         28 
# significant metabolites
sort(unique(vf.cell.pos.hits$compound_full))
 [1] "3-Phosphoglyceroinositol"                   
 [2] "Adenine"                                    
 [3] "Adenosine"                                  
 [4] "ADP-Glucose"                                
 [5] "AMP"                                        
 [6] "Aspartic Acid"                              
 [7] "ATP"                                        
 [8] "Beta-Alanine"                               
 [9] "CMP"                                        
[10] "CoA"                                        
[11] "CTP"                                        
[12] "D-Galactitol"                               
[13] "Dihydrobiopterin"                           
[14] "Fucose"                                     
[15] "G3P"                                        
[16] "GalNAc"                                     
[17] "Glycerol-3-phosphocholine"                  
[18] "Glycerol 1-Phosphate / Glycerol 3-Phosphate"
[19] "Glycerol 1-phosphoserine"                   
[20] "GMP"                                        
[21] "Guanine"                                    
[22] "Guanosine"                                  
[23] "Hypoxanthine"                               
[24] "Inosine"                                    
[25] "N-Carbamoyl-L-aspartic Acid"                
[26] "NADH"                                       
[27] "NMN"                                        
[28] "PAF C18"                                    
[29] "Proline"                                    
[30] "UMP"                                        
[31] "Uracil"                                     
[32] "UTP"                                        
vf.cell.pos.hits.tally2 <- vf.cell.pos.hits %>% 
  group_by(compound_short, compound_full) %>% 
  count() %>% 
  filter(n == 2)
vf.cell.pos.lowFA.hits <- vf.cell.pos.hits %>% 
  filter(term == "vpa_lowFA" & !(compound_short %in% vf.cell.pos.hits.tally2$compound_short))
vf.cell.pos.highFA.hits <- vf.cell.pos.hits %>% 
  filter(term == "vpa_highFA" & !(compound_short %in% vf.cell.pos.hits.tally2$compound_short))
vf.cell.pos.both.hits <- vf.cell.pos.hits %>% 
  filter(compound_short %in% vf.cell.pos.hits.tally2$compound_short) %>% 
  arrange(compound_short, term)
vf.cell.pos.both.hits %>% 
  select(compound_full, term, FC) %>% 
  spread(key = "term", value = "FC") %>% 
  ggplot(aes(vpa_lowFA, vpa_highFA)) +
  geom_point(size = 2, alpha = 0.8) +
  geom_abline(intercept = 0, slope = 1, color = "blue", alpha = 0.8)

vf.cell.pos.both.hits %>% 
  select(compound_full, term, FC) %>% 
  spread(key = "term", value = "FC") %>% 
  mutate(diff = vpa_highFA - vpa_lowFA) %>% 
  arrange(desc(diff))
# A tibble: 20 x 4
   compound_full             vpa_highFA vpa_lowFA      diff
   <chr>                          <dbl>     <dbl>     <dbl>
 1 ADP-Glucose                    4.08      3.35   0.726   
 2 Dihydrobiopterin               2.81      2.59   0.220   
 3 Proline                        1.64      1.45   0.188   
 4 GMP                            0.590     0.515  0.0755  
 5 Guanine                        0.460     0.416  0.0438  
 6 Glycerol 1-phosphoserine       0.748     0.722  0.0260  
 7 Glycerol-3-phosphocholine      0.224     0.214  0.0101  
 8 CMP                            0.549     0.539  0.00911 
 9 Adenosine                      0.434     0.433  0.000522
10 Guanosine                      0.460     0.472 -0.0116  
11 NMN                            0.401     0.418 -0.0168  
12 UMP                            0.548     0.577 -0.0298  
13 Hypoxanthine                   0.501     0.533 -0.0317  
14 3-Phosphoglyceroinositol       0.743     0.775 -0.0325  
15 AMP                            0.496     0.530 -0.0336  
16 Inosine                        0.516     0.566 -0.0504  
17 GalNAc                         0.575     0.652 -0.0767  
18 D-Galactitol                   0.470     0.563 -0.0933  
19 Fucose                         0.463     0.589 -0.126   
20 Aspartic Acid                  1.32      1.46  -0.140   
### Plotting ###
vf.cell.pos.gathered <- vf.cell.pos.noNA %>%
  filter(Group == "sample") %>% 
  bind_cols(vf.cell.pos.surr.var) %>% 
  select(Samples, VPA, FA, S1:S2, starts_with("VPA_FApC")) %>% 
  gather(key = "Compound", value = "Abundance", VPA_FApC10:VPA_FApC99)
vf.cell.pos.nested <- vf.cell.pos.gathered %>% 
  group_by(Compound) %>% 
  nest() %>% 
  mutate(model = map(data, ~lm(Abundance ~ S1 + S2, data = .))) %>% 
  mutate(augment_model = map(model, augment))
vf.cell.pos.modSV.resid <- vf.cell.pos.nested %>% 
  unnest(data, augment_model) %>% 
  select(Samples, VPA, FA, Compound, .resid) %>% 
  spread(Compound, .resid)
vf.cell.pos.modSV.resid %>% 
  select(Samples:FA, one_of(unique(vf.cell.pos.hits$compound_short))) %>% 
  HeatmapPrepAlt("VPA_FApC") %>% 
  t() %>% 
  heatmaply(
    colors = viridis(n = 10, option = "magma"), 
    xlab = "Samples", ylab = "Compounds",
    main = "Statistically significant compounds\nVPA + FA Experiment / Cells / Positive Mode",
    margins = c(50, 50, 75, 30),
    k_col = 2, k_row = 2,
    labRow = unique(vf.cell.pos.hits$compound_full)
    )
### PCA - cleaned data ###
vf.cell.pos.modSV.pca <- vf.cell.pos.modSV.resid %>% 
  select(starts_with("VPA_FApC")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
vf.cell.pos.modSV.pca.x <- as.data.frame(vf.cell.pos.modSV.pca$x)
row.names(vf.cell.pos.modSV.pca.x) <- vf.cell.pos.modSV.resid$Samples
vf.cell.pos.modSV.pca.x <- vf.cell.pos.modSV.pca.x %>% 
  bind_cols(vf.cell.pos.modSV.resid %>% select(VPA:FA))
vf.cell.pos.modSV.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = VPA, shape = FA)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (60.6% Var)") +
  ylab("PC2 (11.6% Var)")

vf.cell.pos.modSV.pca.x %>% 
  ggplot(aes(x = PC3, y = PC4, color = VPA, shape = FA)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC3 (7.4% Var)") +
  ylab("PC4 (4.5% Var)")

4.5 Media / Negative Mode

# sample prep
vf.med.neg.smpl.data <- vf.med.neg.noNA %>% 
  filter(Group == "sample") %>% 
  unite("Treatment", VPA:FA, sep = "_")
vf.med.neg.data.for.sva <- as.matrix(
  vf.med.neg.smpl.data[, which(
    colnames(vf.med.neg.smpl.data) == "VPA_FAnM1"
    ):ncol(vf.med.neg.smpl.data)]
  )
row.names(vf.med.neg.data.for.sva) <- vf.med.neg.smpl.data$Samples
vf.med.neg.data.for.sva <- t(vf.med.neg.data.for.sva)
# pheno prep
vf.med.neg.data.pheno <- as.data.frame(vf.med.neg.smpl.data[, 5:6])
row.names(vf.med.neg.data.pheno) <- vf.med.neg.smpl.data$Samples
# sva calculation
vf.med.neg.mod.vf <- model.matrix(~ 0 + as.factor(Treatment), data = vf.med.neg.data.pheno)
vf.med.neg.mod0 <- model.matrix(~ 1, data = vf.med.neg.data.pheno)
set.seed(2018)
num.sv(vf.med.neg.data.for.sva, vf.med.neg.mod.vf, method = "be")
[1] 1
set.seed(2018)
num.sv(vf.med.neg.data.for.sva, vf.med.neg.mod.vf, method = "leek")
[1] 0
set.seed(2018)
vf.med.neg.sv <- sva(vf.med.neg.data.for.sva, vf.med.neg.mod.vf, vf.med.neg.mod0)
Number of significant surrogate variables is:  1 
Iteration (out of 5 ):1  2  3  4  5  
# extract the surrogate variables
vf.med.neg.surr.var <- as.data.frame(vf.med.neg.sv$sv)
colnames(vf.med.neg.surr.var) <- c("S1")
colnames(vf.med.neg.mod.vf) <- c("cntrl", "fa_only", "vpa_only", "fa_and_vpa")
# combine the full model matrix and the surrogate variables into one
vf.med.neg.design.sv <- cbind(vf.med.neg.mod.vf, vf.med.neg.surr.var)
vf.med.neg.cont.mat <- makeContrasts(
  vpa_lowFA = vpa_only - cntrl, 
  vpa_highFA = fa_and_vpa - fa_only,
  FA_diff = (fa_and_vpa - fa_only) - (vpa_only - cntrl),
  levels = c("cntrl", "fa_only", "vpa_only", "fa_and_vpa", "S1")
  )
# fit the model/design matrix
vf.med.neg.eb <- vf.med.neg.data.for.sva %>% 
  lmFit(vf.med.neg.design.sv) %>% 
  contrasts.fit(vf.med.neg.cont.mat) %>% 
  eBayes()
# pull out the results for each metabolite for each comparison
vf.med.neg.eb.tidy <- tidy(vf.med.neg.eb) %>% 
  mutate(
    adj_pval = p.adjust(p.value, method = "bonferroni"),
    FC = 2 ^ estimate,
    change_in_vpa = ifelse(FC < 1, "down", "up")
    ) %>% 
  rename(compound_short = gene)
# volcano plot
vf.med.neg.eb.tidy %>% 
  ggplot(aes(estimate, -log10(adj_pval))) +
  geom_point(size = 2, alpha = 0.5) +
  geom_hline(linetype = "dashed", color = "#009E73", yintercept = -log10(0.05)) +
  geom_vline(linetype = "dashed", color = "#CC79A7", xintercept = log2(1.2)) +
  geom_vline(linetype = "dashed", color = "#CC79A7", xintercept = log2(1/1.2)) +
  xlab("log2(FC)") +
  ylab("-log10(adjusted p-value)") +
  ggtitle("Volcano plot\nVPA + FA / meds / Negative Mode")

# select statistically significant hits with a certain FC:
vf.med.neg.hits <- vf.med.neg.eb.tidy %>% 
  filter(adj_pval < 0.05 & (FC > 1.2 | FC < 1/1.2)) %>% 
  inner_join(vf.med.neg.compound.info, by = "compound_short")
# how many metabolites are significant across the different contrats
table(vf.med.neg.hits$term)

   FA_diff vpa_highFA  vpa_lowFA 
         0          1          1 
# significant metabolites
sort(unique(vf.med.neg.hits$compound_full))
[1] "Valproic acid"
vf.med.neg.hits
# A tibble: 2 x 14
  compound_short term  estimate statistic  p.value   lod adj_pval    FC
  <chr>          <fct>    <dbl>     <dbl>    <dbl> <dbl>    <dbl> <dbl>
1 VPA_FAnM27     vpa_~     3.98      45.5 1.62e-22  41.7 2.08e-20  15.8
2 VPA_FAnM27     vpa_~     3.99      45.4 1.66e-22  41.7 2.14e-20  15.9
# ... with 6 more variables: change_in_vpa <chr>, compound_full <chr>,
#   formula <chr>, mass <dbl>, rt <dbl>, cas_id <chr>

4.6 Media / Positive Mode

# sample prep
vf.med.pos.smpl.data <- vf.med.pos.noNA %>% 
  filter(Group == "sample") %>% 
  unite("Treatment", VPA:FA, sep = "_")
vf.med.pos.data.for.sva <- as.matrix(
  vf.med.pos.smpl.data[, which(
    colnames(vf.med.pos.smpl.data) == "VPA_FApM1"
    ):ncol(vf.med.pos.smpl.data)]
  )
row.names(vf.med.pos.data.for.sva) <- vf.med.pos.smpl.data$Samples
vf.med.pos.data.for.sva <- t(vf.med.pos.data.for.sva)
# pheno prep
vf.med.pos.data.pheno <- as.data.frame(vf.med.pos.smpl.data[, 5:6])
row.names(vf.med.pos.data.pheno) <- vf.med.pos.smpl.data$Samples
# sva calculation
vf.med.pos.mod.vf <- model.matrix(~ 0 + as.factor(Treatment), data = vf.med.pos.data.pheno)
vf.med.pos.mod0 <- model.matrix(~ 1, data = vf.med.pos.data.pheno)
set.seed(2018)
num.sv(vf.med.pos.data.for.sva, vf.med.pos.mod.vf, method = "be")
[1] 1
set.seed(2018)
num.sv(vf.med.pos.data.for.sva, vf.med.pos.mod.vf, method = "leek")
[1] 0
set.seed(2018)
vf.med.pos.sv <- sva(vf.med.pos.data.for.sva, vf.med.pos.mod.vf, vf.med.pos.mod0)
Number of significant surrogate variables is:  1 
Iteration (out of 5 ):1  2  3  4  5  
# extract the surrogate variables
vf.med.pos.surr.var <- as.data.frame(vf.med.pos.sv$sv)
colnames(vf.med.pos.surr.var) <- c("S1")
colnames(vf.med.pos.mod.vf) <- c("cntrl", "fa_only", "vpa_only", "fa_and_vpa")
# combine the full model matrix and the surrogate variables into one
vf.med.pos.design.sv <- cbind(vf.med.pos.mod.vf, vf.med.pos.surr.var)
vf.med.pos.cont.mat <- makeContrasts(
  vpa_lowFA = vpa_only - cntrl, 
  vpa_highFA = fa_and_vpa - fa_only,
  FA_diff = (fa_and_vpa - fa_only) - (vpa_only - cntrl),
  levels = c("cntrl", "fa_only", "vpa_only", "fa_and_vpa", "S1")
  )
# fit the model/design matrix
vf.med.pos.eb <- vf.med.pos.data.for.sva %>% 
  lmFit(vf.med.pos.design.sv) %>% 
  contrasts.fit(vf.med.pos.cont.mat) %>% 
  eBayes()
# pull out the results for each metabolite for each comparison
vf.med.pos.eb.tidy <- tidy(vf.med.pos.eb) %>% 
  mutate(
    adj_pval = p.adjust(p.value, method = "bonferroni"),
    FC = 2 ^ estimate,
    change_in_vpa = ifelse(FC < 1, "down", "up")
    ) %>% 
  rename(compound_short = gene)
# volcano plot
vf.med.pos.eb.tidy %>% 
  ggplot(aes(estimate, -log10(adj_pval))) +
  geom_point(size = 2, alpha = 0.5) +
  geom_hline(linetype = "dashed", color = "#009E73", yintercept = -log10(0.05)) +
  geom_vline(linetype = "dashed", color = "#CC79A7", xintercept = log2(1.2)) +
  geom_vline(linetype = "dashed", color = "#CC79A7", xintercept = log2(1/1.2)) +
  xlab("log2(FC)") +
  ylab("-log10(adjusted p-value)") +
  ggtitle("Volcano plot\nVPA + FA / meds / posative Mode")

# select statistically significant hits with a certain FC:
vf.med.pos.hits <- vf.med.pos.eb.tidy %>% 
  filter(adj_pval < 0.05 & (FC > 1.2 | FC < 1/1.2)) %>% 
  inner_join(vf.med.pos.compound.info, by = "compound_short")
# how many metabolites are significant across the different contrats
table(vf.med.pos.hits$term)

   FA_diff vpa_highFA  vpa_lowFA 
         0          0          0